{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Spark SQL 练习：火警呼叫分析"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark import SparkContext, SparkConf\n",
    "from pyspark.sql import SparkSession\n",
    "import pyspark.sql.functions as F\n",
    "import pyspark.sql.types as T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Driver\n",
    "spark = SparkSession.builder.master('local').appName(\"HelloSpark\").getOrCreate()\n",
    "spark.conf.set('spark.sql.repl.eagerEval.enabled', True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create scheme\n",
    "fire_schema = T.StructType([T.StructField(\"CallNumber\", T.IntegerType(), True),\n",
    "                          T.StructField(\"UnitID\", T.StringType(), True),\n",
    "                          T.StructField(\"IncidentNumber\", T.IntegerType(), True),\n",
    "                          T.StructField(\"CallType\", T.StringType(), True),\n",
    "                          T.StructField(\"CallDate\", T.StringType(), True),\n",
    "                          T.StructField(\"WatchDate\", T.StringType(), True),\n",
    "                          T.StructField(\"CallFinalDisposition\", T.StringType(), True),\n",
    "                          T.StructField(\"AvailableDtTm\", T.StringType(), True),\n",
    "                          T.StructField(\"Address\", T.StringType(), True),\n",
    "                          T.StructField(\"City\", T.StringType(), True),\n",
    "                          T.StructField(\"Zipcode\", T.IntegerType(), True),\n",
    "                          T.StructField(\"Battalion\", T.StringType(), True),\n",
    "                          T.StructField(\"StationArea\", T.StringType(), True),\n",
    "                          T.StructField(\"Box\", T.StringType(), True),\n",
    "                          T.StructField(\"OriginalPriority\", T.StringType(), True),\n",
    "                          T.StructField(\"Priority\", T.StringType(), True),\n",
    "                          T.StructField(\"FinalPriority\", T.IntegerType(), True),\n",
    "                          T.StructField(\"ALSUnit\", T.BooleanType(), True),\n",
    "                          T.StructField(\"CallTypeGroup\", T.StringType(), True),\n",
    "                          T.StructField(\"NumAlarms\", T.IntegerType(), True),\n",
    "                          T.StructField(\"UnitType\", T.StringType(), True),\n",
    "                          T.StructField(\"UnitSequenceInCallDispatch\", T.IntegerType(), True),\n",
    "                          T.StructField(\"FirePreventionDistrict\", T.StringType(), True),\n",
    "                          T.StructField(\"SupervisorDistrict\", T.StringType(), True),\n",
    "                          T.StructField(\"Neighborhood\", T.StringType(), True),\n",
    "                          T.StructField(\"Location\", T.StringType(), True),\n",
    "                          T.StructField(\"RowID\", T.StringType(), True),\n",
    "                          T.StructField(\"Delay\", T.FloatType(), True)\n",
    "                          ]\n",
    "                         )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = spark.read.option('header', True).schema(fire_schema).csv('sf-fire-calls.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table border='1'>\n",
       "<tr><th>CallNumber</th><th>UnitID</th><th>IncidentNumber</th><th>CallType</th><th>CallDate</th><th>WatchDate</th><th>CallFinalDisposition</th><th>AvailableDtTm</th><th>Address</th><th>City</th><th>Zipcode</th><th>Battalion</th><th>StationArea</th><th>Box</th><th>OriginalPriority</th><th>Priority</th><th>FinalPriority</th><th>ALSUnit</th><th>CallTypeGroup</th><th>NumAlarms</th><th>UnitType</th><th>UnitSequenceInCallDispatch</th><th>FirePreventionDistrict</th><th>SupervisorDistrict</th><th>Neighborhood</th><th>Location</th><th>RowID</th><th>Delay</th></tr>\n",
       "<tr><td>20110016</td><td>T13</td><td>2003235</td><td>Structure Fire</td><td>01/11/2002</td><td>01/10/2002</td><td>Other</td><td>01/11/2002 01:51:...</td><td>2000 Block of CAL...</td><td>SF</td><td>94109</td><td>B04</td><td>38</td><td>3362</td><td>3</td><td>3</td><td>3</td><td>false</td><td>null</td><td>1</td><td>TRUCK</td><td>2</td><td>4</td><td>5</td><td>Pacific Heights</td><td>(37.7895840679362...</td><td>020110016-T13</td><td>2.95</td></tr>\n",
       "<tr><td>20110022</td><td>M17</td><td>2003241</td><td>Medical Incident</td><td>01/11/2002</td><td>01/10/2002</td><td>Other</td><td>01/11/2002 03:01:...</td><td>0 Block of SILVER...</td><td>SF</td><td>94124</td><td>B10</td><td>42</td><td>6495</td><td>3</td><td>3</td><td>3</td><td>true</td><td>null</td><td>1</td><td>MEDIC</td><td>1</td><td>10</td><td>10</td><td>Bayview Hunters P...</td><td>(37.7337623673897...</td><td>020110022-M17</td><td>4.7</td></tr>\n",
       "<tr><td>20110023</td><td>M41</td><td>2003242</td><td>Medical Incident</td><td>01/11/2002</td><td>01/10/2002</td><td>Other</td><td>01/11/2002 02:39:...</td><td>MARKET ST/MCALLIS...</td><td>SF</td><td>94102</td><td>B03</td><td>01</td><td>1455</td><td>3</td><td>3</td><td>3</td><td>true</td><td>null</td><td>1</td><td>MEDIC</td><td>2</td><td>3</td><td>6</td><td>Tenderloin</td><td>(37.7811772186856...</td><td>020110023-M41</td><td>2.4333334</td></tr>\n",
       "<tr><td>20110032</td><td>E11</td><td>2003250</td><td>Vehicle Fire</td><td>01/11/2002</td><td>01/10/2002</td><td>Other</td><td>01/11/2002 04:16:...</td><td>APPLETON AV/MISSI...</td><td>SF</td><td>94110</td><td>B06</td><td>32</td><td>5626</td><td>3</td><td>3</td><td>3</td><td>false</td><td>null</td><td>1</td><td>ENGINE</td><td>1</td><td>6</td><td>9</td><td>Bernal Heights</td><td>(37.7388432849018...</td><td>020110032-E11</td><td>1.5</td></tr>\n",
       "<tr><td>20110043</td><td>B04</td><td>2003259</td><td>Alarms</td><td>01/11/2002</td><td>01/10/2002</td><td>Other</td><td>01/11/2002 06:01:...</td><td>1400 Block of SUT...</td><td>SF</td><td>94109</td><td>B04</td><td>03</td><td>3223</td><td>3</td><td>3</td><td>3</td><td>false</td><td>null</td><td>1</td><td>CHIEF</td><td>2</td><td>4</td><td>2</td><td>Western Addition</td><td>(37.7872890372638...</td><td>020110043-B04</td><td>3.4833333</td></tr>\n",
       "<tr><td>20110072</td><td>T08</td><td>2003279</td><td>Structure Fire</td><td>01/11/2002</td><td>01/11/2002</td><td>Other</td><td>01/11/2002 08:03:...</td><td>BEALE ST/FOLSOM ST</td><td>SF</td><td>94105</td><td>B03</td><td>35</td><td>2122</td><td>3</td><td>3</td><td>3</td><td>false</td><td>null</td><td>1</td><td>TRUCK</td><td>2</td><td>3</td><td>6</td><td>Financial Distric...</td><td>(37.7886866619654...</td><td>020110072-T08</td><td>1.75</td></tr>\n",
       "<tr><td>20110125</td><td>E33</td><td>2003301</td><td>Alarms</td><td>01/11/2002</td><td>01/11/2002</td><td>Other</td><td>01/11/2002 09:46:...</td><td>0 Block of FARALL...</td><td>SF</td><td>94112</td><td>B09</td><td>33</td><td>8324</td><td>3</td><td>3</td><td>3</td><td>false</td><td>null</td><td>1</td><td>ENGINE</td><td>2</td><td>9</td><td>11</td><td>Oceanview/Merced/...</td><td>(37.7140353531157...</td><td>020110125-E33</td><td>2.7166667</td></tr>\n",
       "<tr><td>20110130</td><td>E36</td><td>2003304</td><td>Alarms</td><td>01/11/2002</td><td>01/11/2002</td><td>Other</td><td>01/11/2002 09:58:...</td><td>600 Block of POLK ST</td><td>SF</td><td>94102</td><td>B02</td><td>03</td><td>3114</td><td>3</td><td>3</td><td>3</td><td>false</td><td>null</td><td>1</td><td>ENGINE</td><td>1</td><td>2</td><td>6</td><td>Tenderloin</td><td>(37.7826266328595...</td><td>020110130-E36</td><td>1.7833333</td></tr>\n",
       "<tr><td>20110197</td><td>E05</td><td>2003343</td><td>Medical Incident</td><td>01/11/2002</td><td>01/11/2002</td><td>Other</td><td>01/11/2002 12:06:...</td><td>1500 Block of WEB...</td><td>SF</td><td>94115</td><td>B04</td><td>05</td><td>3513</td><td>3</td><td>3</td><td>3</td><td>false</td><td>null</td><td>1</td><td>ENGINE</td><td>1</td><td>4</td><td>5</td><td>Japantown</td><td>(37.784958590666,...</td><td>020110197-E05</td><td>1.5166667</td></tr>\n",
       "<tr><td>20110215</td><td>E06</td><td>2003348</td><td>Medical Incident</td><td>01/11/2002</td><td>01/11/2002</td><td>Other</td><td>01/11/2002 01:08:...</td><td>DIAMOND ST/MARKET ST</td><td>SF</td><td>94114</td><td>B05</td><td>06</td><td>5415</td><td>3</td><td>3</td><td>3</td><td>false</td><td>null</td><td>1</td><td>ENGINE</td><td>1</td><td>5</td><td>8</td><td>Castro/Upper Market</td><td>(37.7618954753708...</td><td>020110215-E06</td><td>2.7666667</td></tr>\n",
       "<tr><td>20110274</td><td>M07</td><td>2003381</td><td>Medical Incident</td><td>01/11/2002</td><td>01/11/2002</td><td>Other</td><td>01/11/2002 03:31:...</td><td>2700 Block of MIS...</td><td>SF</td><td>94110</td><td>B06</td><td>11</td><td>5525</td><td>1</td><td>1</td><td>2</td><td>true</td><td>null</td><td>1</td><td>MEDIC</td><td>1</td><td>6</td><td>9</td><td>Mission</td><td>(37.7530339738059...</td><td>020110274-M07</td><td>2.1833334</td></tr>\n",
       "<tr><td>20110275</td><td>T15</td><td>2003382</td><td>Structure Fire</td><td>01/11/2002</td><td>01/11/2002</td><td>Other</td><td>01/11/2002 02:59:...</td><td>BRUNSWICK ST/GUTT...</td><td>SF</td><td>94112</td><td>B09</td><td>43</td><td>6218</td><td>3</td><td>3</td><td>3</td><td>false</td><td>null</td><td>1</td><td>TRUCK</td><td>1</td><td>9</td><td>11</td><td>Excelsior</td><td>(37.7105545807996...</td><td>020110275-T15</td><td>2.5</td></tr>\n",
       "<tr><td>20110304</td><td>E03</td><td>2003399</td><td>Medical Incident</td><td>01/11/2002</td><td>01/11/2002</td><td>Other</td><td>01/11/2002 04:22:...</td><td>1000 Block of SUT...</td><td>SF</td><td>94109</td><td>B04</td><td>03</td><td>1557</td><td>3</td><td>3</td><td>3</td><td>false</td><td>null</td><td>1</td><td>ENGINE</td><td>1</td><td>4</td><td>3</td><td>Nob Hill</td><td>(37.7881263034393...</td><td>020110304-E03</td><td>2.4166667</td></tr>\n",
       "<tr><td>20110308</td><td>E14</td><td>2003403</td><td>Medical Incident</td><td>01/11/2002</td><td>01/11/2002</td><td>Other</td><td>01/11/2002 04:18:...</td><td>100 Block of 21ST...</td><td>SF</td><td>94121</td><td>B07</td><td>14</td><td>7173</td><td>3</td><td>3</td><td>3</td><td>false</td><td>null</td><td>1</td><td>ENGINE</td><td>1</td><td>7</td><td>1</td><td>Outer Richmond</td><td>(37.7850084431077...</td><td>020110308-E14</td><td>4.95</td></tr>\n",
       "<tr><td>20110313</td><td>B10</td><td>2003408</td><td>Structure Fire</td><td>01/11/2002</td><td>01/11/2002</td><td>Other</td><td>01/11/2002 04:09:...</td><td>700 Block of CAPP ST</td><td>SF</td><td>94110</td><td>B06</td><td>07</td><td>5472</td><td>3</td><td>3</td><td>3</td><td>false</td><td>null</td><td>1</td><td>CHIEF</td><td>6</td><td>6</td><td>9</td><td>Mission</td><td>(37.7547064357942...</td><td>020110313-B10</td><td>1.4166666</td></tr>\n",
       "<tr><td>20110313</td><td>D3</td><td>2003408</td><td>Structure Fire</td><td>01/11/2002</td><td>01/11/2002</td><td>Other</td><td>01/11/2002 04:09:...</td><td>700 Block of CAPP ST</td><td>SF</td><td>94110</td><td>B06</td><td>07</td><td>5472</td><td>3</td><td>3</td><td>3</td><td>false</td><td>null</td><td>1</td><td>CHIEF</td><td>4</td><td>6</td><td>9</td><td>Mission</td><td>(37.7547064357942...</td><td>020110313-D3</td><td>2.5333333</td></tr>\n",
       "<tr><td>20110313</td><td>E32</td><td>2003408</td><td>Structure Fire</td><td>01/11/2002</td><td>01/11/2002</td><td>Other</td><td>01/11/2002 04:09:...</td><td>700 Block of CAPP ST</td><td>SF</td><td>94110</td><td>B06</td><td>07</td><td>5472</td><td>3</td><td>3</td><td>3</td><td>true</td><td>null</td><td>1</td><td>ENGINE</td><td>8</td><td>6</td><td>9</td><td>Mission</td><td>(37.7547064357942...</td><td>020110313-E32</td><td>1.8833333</td></tr>\n",
       "<tr><td>20110315</td><td>RC2</td><td>2003409</td><td>Medical Incident</td><td>01/11/2002</td><td>01/11/2002</td><td>Other</td><td>01/11/2002 04:34:...</td><td>200 Block of LAGU...</td><td>SF</td><td>94116</td><td>B08</td><td>20</td><td>8635</td><td>3</td><td>3</td><td>3</td><td>true</td><td>null</td><td>1</td><td>RESCUE CAPTAIN</td><td>2</td><td>8</td><td>7</td><td>West of Twin Peaks</td><td>(37.7501117393668...</td><td>020110315-RC2</td><td>5.35</td></tr>\n",
       "<tr><td>20110330</td><td>E14</td><td>2003417</td><td>Medical Incident</td><td>01/11/2002</td><td>01/11/2002</td><td>Other</td><td>01/11/2002 04:51:...</td><td>BALBOA ST/PARK PR...</td><td>SF</td><td>94118</td><td>B07</td><td>31</td><td>7145</td><td>3</td><td>3</td><td>3</td><td>false</td><td>null</td><td>1</td><td>ENGINE</td><td>1</td><td>7</td><td>1</td><td>Inner Richmond</td><td>(37.7768682293368...</td><td>020110330-E14</td><td>2.0</td></tr>\n",
       "<tr><td>20110330</td><td>M12</td><td>2003417</td><td>Medical Incident</td><td>01/11/2002</td><td>01/11/2002</td><td>Other</td><td>01/11/2002 04:51:...</td><td>BALBOA ST/PARK PR...</td><td>SF</td><td>94118</td><td>B07</td><td>31</td><td>7145</td><td>3</td><td>3</td><td>3</td><td>true</td><td>null</td><td>1</td><td>MEDIC</td><td>2</td><td>7</td><td>1</td><td>Inner Richmond</td><td>(37.7768682293368...</td><td>020110330-M12</td><td>1.8166667</td></tr>\n",
       "</table>\n",
       "only showing top 20 rows\n"
      ],
      "text/plain": [
       "+----------+------+--------------+----------------+----------+----------+--------------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+-------------+-------+-------------+---------+--------------+--------------------------+----------------------+------------------+--------------------+--------------------+-------------+---------+\n",
       "|CallNumber|UnitID|IncidentNumber|        CallType|  CallDate| WatchDate|CallFinalDisposition|       AvailableDtTm|             Address|City|Zipcode|Battalion|StationArea| Box|OriginalPriority|Priority|FinalPriority|ALSUnit|CallTypeGroup|NumAlarms|      UnitType|UnitSequenceInCallDispatch|FirePreventionDistrict|SupervisorDistrict|        Neighborhood|            Location|        RowID|    Delay|\n",
       "+----------+------+--------------+----------------+----------+----------+--------------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+-------------+-------+-------------+---------+--------------+--------------------------+----------------------+------------------+--------------------+--------------------+-------------+---------+\n",
       "|  20110016|   T13|       2003235|  Structure Fire|01/11/2002|01/10/2002|               Other|01/11/2002 01:51:...|2000 Block of CAL...|  SF|  94109|      B04|         38|3362|               3|       3|            3|  false|         null|        1|         TRUCK|                         2|                     4|                 5|     Pacific Heights|(37.7895840679362...|020110016-T13|     2.95|\n",
       "|  20110022|   M17|       2003241|Medical Incident|01/11/2002|01/10/2002|               Other|01/11/2002 03:01:...|0 Block of SILVER...|  SF|  94124|      B10|         42|6495|               3|       3|            3|   true|         null|        1|         MEDIC|                         1|                    10|                10|Bayview Hunters P...|(37.7337623673897...|020110022-M17|      4.7|\n",
       "|  20110023|   M41|       2003242|Medical Incident|01/11/2002|01/10/2002|               Other|01/11/2002 02:39:...|MARKET ST/MCALLIS...|  SF|  94102|      B03|         01|1455|               3|       3|            3|   true|         null|        1|         MEDIC|                         2|                     3|                 6|          Tenderloin|(37.7811772186856...|020110023-M41|2.4333334|\n",
       "|  20110032|   E11|       2003250|    Vehicle Fire|01/11/2002|01/10/2002|               Other|01/11/2002 04:16:...|APPLETON AV/MISSI...|  SF|  94110|      B06|         32|5626|               3|       3|            3|  false|         null|        1|        ENGINE|                         1|                     6|                 9|      Bernal Heights|(37.7388432849018...|020110032-E11|      1.5|\n",
       "|  20110043|   B04|       2003259|          Alarms|01/11/2002|01/10/2002|               Other|01/11/2002 06:01:...|1400 Block of SUT...|  SF|  94109|      B04|         03|3223|               3|       3|            3|  false|         null|        1|         CHIEF|                         2|                     4|                 2|    Western Addition|(37.7872890372638...|020110043-B04|3.4833333|\n",
       "|  20110072|   T08|       2003279|  Structure Fire|01/11/2002|01/11/2002|               Other|01/11/2002 08:03:...|  BEALE ST/FOLSOM ST|  SF|  94105|      B03|         35|2122|               3|       3|            3|  false|         null|        1|         TRUCK|                         2|                     3|                 6|Financial Distric...|(37.7886866619654...|020110072-T08|     1.75|\n",
       "|  20110125|   E33|       2003301|          Alarms|01/11/2002|01/11/2002|               Other|01/11/2002 09:46:...|0 Block of FARALL...|  SF|  94112|      B09|         33|8324|               3|       3|            3|  false|         null|        1|        ENGINE|                         2|                     9|                11|Oceanview/Merced/...|(37.7140353531157...|020110125-E33|2.7166667|\n",
       "|  20110130|   E36|       2003304|          Alarms|01/11/2002|01/11/2002|               Other|01/11/2002 09:58:...|600 Block of POLK ST|  SF|  94102|      B02|         03|3114|               3|       3|            3|  false|         null|        1|        ENGINE|                         1|                     2|                 6|          Tenderloin|(37.7826266328595...|020110130-E36|1.7833333|\n",
       "|  20110197|   E05|       2003343|Medical Incident|01/11/2002|01/11/2002|               Other|01/11/2002 12:06:...|1500 Block of WEB...|  SF|  94115|      B04|         05|3513|               3|       3|            3|  false|         null|        1|        ENGINE|                         1|                     4|                 5|           Japantown|(37.784958590666,...|020110197-E05|1.5166667|\n",
       "|  20110215|   E06|       2003348|Medical Incident|01/11/2002|01/11/2002|               Other|01/11/2002 01:08:...|DIAMOND ST/MARKET ST|  SF|  94114|      B05|         06|5415|               3|       3|            3|  false|         null|        1|        ENGINE|                         1|                     5|                 8| Castro/Upper Market|(37.7618954753708...|020110215-E06|2.7666667|\n",
       "|  20110274|   M07|       2003381|Medical Incident|01/11/2002|01/11/2002|               Other|01/11/2002 03:31:...|2700 Block of MIS...|  SF|  94110|      B06|         11|5525|               1|       1|            2|   true|         null|        1|         MEDIC|                         1|                     6|                 9|             Mission|(37.7530339738059...|020110274-M07|2.1833334|\n",
       "|  20110275|   T15|       2003382|  Structure Fire|01/11/2002|01/11/2002|               Other|01/11/2002 02:59:...|BRUNSWICK ST/GUTT...|  SF|  94112|      B09|         43|6218|               3|       3|            3|  false|         null|        1|         TRUCK|                         1|                     9|                11|           Excelsior|(37.7105545807996...|020110275-T15|      2.5|\n",
       "|  20110304|   E03|       2003399|Medical Incident|01/11/2002|01/11/2002|               Other|01/11/2002 04:22:...|1000 Block of SUT...|  SF|  94109|      B04|         03|1557|               3|       3|            3|  false|         null|        1|        ENGINE|                         1|                     4|                 3|            Nob Hill|(37.7881263034393...|020110304-E03|2.4166667|\n",
       "|  20110308|   E14|       2003403|Medical Incident|01/11/2002|01/11/2002|               Other|01/11/2002 04:18:...|100 Block of 21ST...|  SF|  94121|      B07|         14|7173|               3|       3|            3|  false|         null|        1|        ENGINE|                         1|                     7|                 1|      Outer Richmond|(37.7850084431077...|020110308-E14|     4.95|\n",
       "|  20110313|   B10|       2003408|  Structure Fire|01/11/2002|01/11/2002|               Other|01/11/2002 04:09:...|700 Block of CAPP ST|  SF|  94110|      B06|         07|5472|               3|       3|            3|  false|         null|        1|         CHIEF|                         6|                     6|                 9|             Mission|(37.7547064357942...|020110313-B10|1.4166666|\n",
       "|  20110313|    D3|       2003408|  Structure Fire|01/11/2002|01/11/2002|               Other|01/11/2002 04:09:...|700 Block of CAPP ST|  SF|  94110|      B06|         07|5472|               3|       3|            3|  false|         null|        1|         CHIEF|                         4|                     6|                 9|             Mission|(37.7547064357942...| 020110313-D3|2.5333333|\n",
       "|  20110313|   E32|       2003408|  Structure Fire|01/11/2002|01/11/2002|               Other|01/11/2002 04:09:...|700 Block of CAPP ST|  SF|  94110|      B06|         07|5472|               3|       3|            3|   true|         null|        1|        ENGINE|                         8|                     6|                 9|             Mission|(37.7547064357942...|020110313-E32|1.8833333|\n",
       "|  20110315|   RC2|       2003409|Medical Incident|01/11/2002|01/11/2002|               Other|01/11/2002 04:34:...|200 Block of LAGU...|  SF|  94116|      B08|         20|8635|               3|       3|            3|   true|         null|        1|RESCUE CAPTAIN|                         2|                     8|                 7|  West of Twin Peaks|(37.7501117393668...|020110315-RC2|     5.35|\n",
       "|  20110330|   E14|       2003417|Medical Incident|01/11/2002|01/11/2002|               Other|01/11/2002 04:51:...|BALBOA ST/PARK PR...|  SF|  94118|      B07|         31|7145|               3|       3|            3|  false|         null|        1|        ENGINE|                         1|                     7|                 1|      Inner Richmond|(37.7768682293368...|020110330-E14|      2.0|\n",
       "|  20110330|   M12|       2003417|Medical Incident|01/11/2002|01/11/2002|               Other|01/11/2002 04:51:...|BALBOA ST/PARK PR...|  SF|  94118|      B07|         31|7145|               3|       3|            3|   true|         null|        1|         MEDIC|                         2|                     7|                 1|      Inner Richmond|(37.7768682293368...|020110330-M12|1.8166667|\n",
       "+----------+------+--------------+----------------+----------+----------+--------------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+-------------+-------+-------------+---------+--------------+--------------------------+----------------------+------------------+--------------------+--------------------+-------------+---------+\n",
       "only showing top 20 rows"
      ]
   },
   "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 实现需求"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1）过滤`CallType == 'Medical Incident'`, 并只打印`\"IncidentNumber\", \"AvailableDtTm\", \"CallType\"` 三个字段"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------+----------------------+----------------+\n",
      "|IncidentNumber|AvailableDtTm         |callType        |\n",
      "+--------------+----------------------+----------------+\n",
      "|2003241       |01/11/2002 03:01:18 AM|Medical Incident|\n",
      "|2003242       |01/11/2002 02:39:50 AM|Medical Incident|\n",
      "|2003343       |01/11/2002 12:06:57 PM|Medical Incident|\n",
      "|2003348       |01/11/2002 01:08:40 PM|Medical Incident|\n",
      "|2003381       |01/11/2002 03:31:02 PM|Medical Incident|\n",
      "|2003399       |01/11/2002 04:22:49 PM|Medical Incident|\n",
      "|2003403       |01/11/2002 04:18:33 PM|Medical Incident|\n",
      "|2003409       |01/11/2002 04:34:23 PM|Medical Incident|\n",
      "|2003417       |01/11/2002 04:51:31 PM|Medical Incident|\n",
      "|2003417       |01/11/2002 04:51:12 PM|Medical Incident|\n",
      "|2003435       |01/11/2002 05:46:30 PM|Medical Incident|\n",
      "|2003500       |01/11/2002 10:08:48 PM|Medical Incident|\n",
      "|2003529       |01/11/2002 10:56:59 PM|Medical Incident|\n",
      "|2003550       |01/12/2002 02:04:06 AM|Medical Incident|\n",
      "|2003576       |01/12/2002 04:17:22 AM|Medical Incident|\n",
      "|2003577       |01/12/2002 04:23:31 AM|Medical Incident|\n",
      "|2003584       |01/12/2002 06:27:31 AM|Medical Incident|\n",
      "|2003593       |null                  |Medical Incident|\n",
      "|2003630       |01/12/2002 11:28:40 AM|Medical Incident|\n",
      "|2003639       |01/12/2002 12:15:25 PM|Medical Incident|\n",
      "+--------------+----------------------+----------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.select('IncidentNumber', 'AvailableDtTm', 'callType')\\\n",
    "    .where(\"CallType == 'Medical Incident'\").show(truncate=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "2）过滤掉 `CallType` 为空的数据，并统计唯一 `CallType` 的个数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---------------+\n",
      "|count(CallType)|\n",
      "+---------------+\n",
      "|             30|\n",
      "+---------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.select('CallType')\\\n",
    "    .where(F.col('CallType').isNotNull())\\\n",
    "    .agg(F.countDistinct('CallType')).show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "3）过滤掉 `CallType` 为空的数据，显示所有的 `CallType` 并去重"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+\n",
      "|            CallType|\n",
      "+--------------------+\n",
      "|Elevator / Escala...|\n",
      "|         Marine Fire|\n",
      "|  Aircraft Emergency|\n",
      "|Confined Space / ...|\n",
      "|      Administrative|\n",
      "|              Alarms|\n",
      "|Odor (Strange / U...|\n",
      "|Citizen Assist / ...|\n",
      "|              HazMat|\n",
      "|Watercraft in Dis...|\n",
      "|           Explosion|\n",
      "|           Oil Spill|\n",
      "|        Vehicle Fire|\n",
      "|  Suspicious Package|\n",
      "|Extrication / Ent...|\n",
      "|               Other|\n",
      "|        Outside Fire|\n",
      "|   Traffic Collision|\n",
      "|       Assist Police|\n",
      "|Gas Leak (Natural...|\n",
      "+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.select('CallType') \\\n",
    "    .where(F.col('CallType').isNotNull()) \\\n",
    "    .distinct().show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "4）重命名 `Delay` 为 `ResponseDelayedinMins`，并过滤出延误大于 $5$ 分钟的记录，只打印 `ResponseDelayedinMins` 字段"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------------------+\n",
      "|ResponseDelayedMins|\n",
      "+-------------------+\n",
      "|               5.35|\n",
      "|               6.25|\n",
      "|                5.2|\n",
      "|                5.6|\n",
      "|               7.25|\n",
      "|          11.916667|\n",
      "|           5.116667|\n",
      "|           8.633333|\n",
      "|           95.28333|\n",
      "|               5.45|\n",
      "|                7.6|\n",
      "|           6.133333|\n",
      "|          5.1833334|\n",
      "|          6.9166665|\n",
      "|                5.2|\n",
      "|               6.35|\n",
      "|           7.983333|\n",
      "|              13.55|\n",
      "|               5.15|\n",
      "|          13.583333|\n",
      "+-------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.withColumnRenamed('Delay', 'ResponseDelayedMins') \\\n",
    "    .where('ResponseDelayedMins > 5') \\\n",
    "    .select('ResponseDelayedMins').show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "5）转换 `IncidentDate`、`OnWatchDate`、`AvailableDtTS` 为日期格式，并删除掉行 `AvailableDtTm`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------------------+-------------------+-------------------+\n",
      "|       IncidentDate|        OnWatchDate|      AvailableDtTS|\n",
      "+-------------------+-------------------+-------------------+\n",
      "|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 01:51:44|\n",
      "|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 03:01:18|\n",
      "|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 02:39:50|\n",
      "|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 04:16:46|\n",
      "|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 06:01:58|\n",
      "+-------------------+-------------------+-------------------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "cleaned_df = df.withColumn('IncidentDate', F.to_timestamp(F.col('CallDate'), 'MM/dd/yyyy')) \\\n",
    "    .drop('CallDate') \\\n",
    "    .withColumn(\"OnWatchDate\", F.to_timestamp(F.col(\"WatchDate\"), \"MM/dd/yyyy\")) \\\n",
    "    .drop(\"WatchDate\") \\\n",
    "    .withColumn(\"AvailableDtTS\", F.to_timestamp(F.col(\"AvailableDtTm\"), \"MM/dd/yyyy hh:mm:ss a\")) \\\n",
    "    .drop(\"AvailableDtTm\")\n",
    " \n",
    "cleaned_df.select(\"IncidentDate\", \"OnWatchDate\", \"AvailableDtTS\").show(5, True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table border='1'>\n",
       "<tr><th>CallNumber</th><th>UnitID</th><th>IncidentNumber</th><th>CallType</th><th>CallFinalDisposition</th><th>Address</th><th>City</th><th>Zipcode</th><th>Battalion</th><th>StationArea</th><th>Box</th><th>OriginalPriority</th><th>Priority</th><th>FinalPriority</th><th>ALSUnit</th><th>CallTypeGroup</th><th>NumAlarms</th><th>UnitType</th><th>UnitSequenceInCallDispatch</th><th>FirePreventionDistrict</th><th>SupervisorDistrict</th><th>Neighborhood</th><th>Location</th><th>RowID</th><th>Delay</th><th>IncidentDate</th><th>OnWatchDate</th><th>AvailableDtTS</th></tr>\n",
       "<tr><td>20110016</td><td>T13</td><td>2003235</td><td>Structure Fire</td><td>Other</td><td>2000 Block of CAL...</td><td>SF</td><td>94109</td><td>B04</td><td>38</td><td>3362</td><td>3</td><td>3</td><td>3</td><td>false</td><td>null</td><td>1</td><td>TRUCK</td><td>2</td><td>4</td><td>5</td><td>Pacific Heights</td><td>(37.7895840679362...</td><td>020110016-T13</td><td>2.95</td><td>2002-01-11 00:00:00</td><td>2002-01-10 00:00:00</td><td>2002-01-11 01:51:44</td></tr>\n",
       "<tr><td>20110022</td><td>M17</td><td>2003241</td><td>Medical Incident</td><td>Other</td><td>0 Block of SILVER...</td><td>SF</td><td>94124</td><td>B10</td><td>42</td><td>6495</td><td>3</td><td>3</td><td>3</td><td>true</td><td>null</td><td>1</td><td>MEDIC</td><td>1</td><td>10</td><td>10</td><td>Bayview Hunters P...</td><td>(37.7337623673897...</td><td>020110022-M17</td><td>4.7</td><td>2002-01-11 00:00:00</td><td>2002-01-10 00:00:00</td><td>2002-01-11 03:01:18</td></tr>\n",
       "<tr><td>20110023</td><td>M41</td><td>2003242</td><td>Medical Incident</td><td>Other</td><td>MARKET ST/MCALLIS...</td><td>SF</td><td>94102</td><td>B03</td><td>01</td><td>1455</td><td>3</td><td>3</td><td>3</td><td>true</td><td>null</td><td>1</td><td>MEDIC</td><td>2</td><td>3</td><td>6</td><td>Tenderloin</td><td>(37.7811772186856...</td><td>020110023-M41</td><td>2.4333334</td><td>2002-01-11 00:00:00</td><td>2002-01-10 00:00:00</td><td>2002-01-11 02:39:50</td></tr>\n",
       "<tr><td>20110032</td><td>E11</td><td>2003250</td><td>Vehicle Fire</td><td>Other</td><td>APPLETON AV/MISSI...</td><td>SF</td><td>94110</td><td>B06</td><td>32</td><td>5626</td><td>3</td><td>3</td><td>3</td><td>false</td><td>null</td><td>1</td><td>ENGINE</td><td>1</td><td>6</td><td>9</td><td>Bernal Heights</td><td>(37.7388432849018...</td><td>020110032-E11</td><td>1.5</td><td>2002-01-11 00:00:00</td><td>2002-01-10 00:00:00</td><td>2002-01-11 04:16:46</td></tr>\n",
       "<tr><td>20110043</td><td>B04</td><td>2003259</td><td>Alarms</td><td>Other</td><td>1400 Block of SUT...</td><td>SF</td><td>94109</td><td>B04</td><td>03</td><td>3223</td><td>3</td><td>3</td><td>3</td><td>false</td><td>null</td><td>1</td><td>CHIEF</td><td>2</td><td>4</td><td>2</td><td>Western Addition</td><td>(37.7872890372638...</td><td>020110043-B04</td><td>3.4833333</td><td>2002-01-11 00:00:00</td><td>2002-01-10 00:00:00</td><td>2002-01-11 06:01:58</td></tr>\n",
       "<tr><td>20110072</td><td>T08</td><td>2003279</td><td>Structure Fire</td><td>Other</td><td>BEALE ST/FOLSOM ST</td><td>SF</td><td>94105</td><td>B03</td><td>35</td><td>2122</td><td>3</td><td>3</td><td>3</td><td>false</td><td>null</td><td>1</td><td>TRUCK</td><td>2</td><td>3</td><td>6</td><td>Financial Distric...</td><td>(37.7886866619654...</td><td>020110072-T08</td><td>1.75</td><td>2002-01-11 00:00:00</td><td>2002-01-11 00:00:00</td><td>2002-01-11 08:03:26</td></tr>\n",
       "<tr><td>20110125</td><td>E33</td><td>2003301</td><td>Alarms</td><td>Other</td><td>0 Block of FARALL...</td><td>SF</td><td>94112</td><td>B09</td><td>33</td><td>8324</td><td>3</td><td>3</td><td>3</td><td>false</td><td>null</td><td>1</td><td>ENGINE</td><td>2</td><td>9</td><td>11</td><td>Oceanview/Merced/...</td><td>(37.7140353531157...</td><td>020110125-E33</td><td>2.7166667</td><td>2002-01-11 00:00:00</td><td>2002-01-11 00:00:00</td><td>2002-01-11 09:46:44</td></tr>\n",
       "<tr><td>20110130</td><td>E36</td><td>2003304</td><td>Alarms</td><td>Other</td><td>600 Block of POLK ST</td><td>SF</td><td>94102</td><td>B02</td><td>03</td><td>3114</td><td>3</td><td>3</td><td>3</td><td>false</td><td>null</td><td>1</td><td>ENGINE</td><td>1</td><td>2</td><td>6</td><td>Tenderloin</td><td>(37.7826266328595...</td><td>020110130-E36</td><td>1.7833333</td><td>2002-01-11 00:00:00</td><td>2002-01-11 00:00:00</td><td>2002-01-11 09:58:53</td></tr>\n",
       "<tr><td>20110197</td><td>E05</td><td>2003343</td><td>Medical Incident</td><td>Other</td><td>1500 Block of WEB...</td><td>SF</td><td>94115</td><td>B04</td><td>05</td><td>3513</td><td>3</td><td>3</td><td>3</td><td>false</td><td>null</td><td>1</td><td>ENGINE</td><td>1</td><td>4</td><td>5</td><td>Japantown</td><td>(37.784958590666,...</td><td>020110197-E05</td><td>1.5166667</td><td>2002-01-11 00:00:00</td><td>2002-01-11 00:00:00</td><td>2002-01-11 12:06:57</td></tr>\n",
       "<tr><td>20110215</td><td>E06</td><td>2003348</td><td>Medical Incident</td><td>Other</td><td>DIAMOND ST/MARKET ST</td><td>SF</td><td>94114</td><td>B05</td><td>06</td><td>5415</td><td>3</td><td>3</td><td>3</td><td>false</td><td>null</td><td>1</td><td>ENGINE</td><td>1</td><td>5</td><td>8</td><td>Castro/Upper Market</td><td>(37.7618954753708...</td><td>020110215-E06</td><td>2.7666667</td><td>2002-01-11 00:00:00</td><td>2002-01-11 00:00:00</td><td>2002-01-11 13:08:40</td></tr>\n",
       "<tr><td>20110274</td><td>M07</td><td>2003381</td><td>Medical Incident</td><td>Other</td><td>2700 Block of MIS...</td><td>SF</td><td>94110</td><td>B06</td><td>11</td><td>5525</td><td>1</td><td>1</td><td>2</td><td>true</td><td>null</td><td>1</td><td>MEDIC</td><td>1</td><td>6</td><td>9</td><td>Mission</td><td>(37.7530339738059...</td><td>020110274-M07</td><td>2.1833334</td><td>2002-01-11 00:00:00</td><td>2002-01-11 00:00:00</td><td>2002-01-11 15:31:02</td></tr>\n",
       "<tr><td>20110275</td><td>T15</td><td>2003382</td><td>Structure Fire</td><td>Other</td><td>BRUNSWICK ST/GUTT...</td><td>SF</td><td>94112</td><td>B09</td><td>43</td><td>6218</td><td>3</td><td>3</td><td>3</td><td>false</td><td>null</td><td>1</td><td>TRUCK</td><td>1</td><td>9</td><td>11</td><td>Excelsior</td><td>(37.7105545807996...</td><td>020110275-T15</td><td>2.5</td><td>2002-01-11 00:00:00</td><td>2002-01-11 00:00:00</td><td>2002-01-11 14:59:04</td></tr>\n",
       "<tr><td>20110304</td><td>E03</td><td>2003399</td><td>Medical Incident</td><td>Other</td><td>1000 Block of SUT...</td><td>SF</td><td>94109</td><td>B04</td><td>03</td><td>1557</td><td>3</td><td>3</td><td>3</td><td>false</td><td>null</td><td>1</td><td>ENGINE</td><td>1</td><td>4</td><td>3</td><td>Nob Hill</td><td>(37.7881263034393...</td><td>020110304-E03</td><td>2.4166667</td><td>2002-01-11 00:00:00</td><td>2002-01-11 00:00:00</td><td>2002-01-11 16:22:49</td></tr>\n",
       "<tr><td>20110308</td><td>E14</td><td>2003403</td><td>Medical Incident</td><td>Other</td><td>100 Block of 21ST...</td><td>SF</td><td>94121</td><td>B07</td><td>14</td><td>7173</td><td>3</td><td>3</td><td>3</td><td>false</td><td>null</td><td>1</td><td>ENGINE</td><td>1</td><td>7</td><td>1</td><td>Outer Richmond</td><td>(37.7850084431077...</td><td>020110308-E14</td><td>4.95</td><td>2002-01-11 00:00:00</td><td>2002-01-11 00:00:00</td><td>2002-01-11 16:18:33</td></tr>\n",
       "<tr><td>20110313</td><td>B10</td><td>2003408</td><td>Structure Fire</td><td>Other</td><td>700 Block of CAPP ST</td><td>SF</td><td>94110</td><td>B06</td><td>07</td><td>5472</td><td>3</td><td>3</td><td>3</td><td>false</td><td>null</td><td>1</td><td>CHIEF</td><td>6</td><td>6</td><td>9</td><td>Mission</td><td>(37.7547064357942...</td><td>020110313-B10</td><td>1.4166666</td><td>2002-01-11 00:00:00</td><td>2002-01-11 00:00:00</td><td>2002-01-11 16:09:08</td></tr>\n",
       "<tr><td>20110313</td><td>D3</td><td>2003408</td><td>Structure Fire</td><td>Other</td><td>700 Block of CAPP ST</td><td>SF</td><td>94110</td><td>B06</td><td>07</td><td>5472</td><td>3</td><td>3</td><td>3</td><td>false</td><td>null</td><td>1</td><td>CHIEF</td><td>4</td><td>6</td><td>9</td><td>Mission</td><td>(37.7547064357942...</td><td>020110313-D3</td><td>2.5333333</td><td>2002-01-11 00:00:00</td><td>2002-01-11 00:00:00</td><td>2002-01-11 16:09:08</td></tr>\n",
       "<tr><td>20110313</td><td>E32</td><td>2003408</td><td>Structure Fire</td><td>Other</td><td>700 Block of CAPP ST</td><td>SF</td><td>94110</td><td>B06</td><td>07</td><td>5472</td><td>3</td><td>3</td><td>3</td><td>true</td><td>null</td><td>1</td><td>ENGINE</td><td>8</td><td>6</td><td>9</td><td>Mission</td><td>(37.7547064357942...</td><td>020110313-E32</td><td>1.8833333</td><td>2002-01-11 00:00:00</td><td>2002-01-11 00:00:00</td><td>2002-01-11 16:09:08</td></tr>\n",
       "<tr><td>20110315</td><td>RC2</td><td>2003409</td><td>Medical Incident</td><td>Other</td><td>200 Block of LAGU...</td><td>SF</td><td>94116</td><td>B08</td><td>20</td><td>8635</td><td>3</td><td>3</td><td>3</td><td>true</td><td>null</td><td>1</td><td>RESCUE CAPTAIN</td><td>2</td><td>8</td><td>7</td><td>West of Twin Peaks</td><td>(37.7501117393668...</td><td>020110315-RC2</td><td>5.35</td><td>2002-01-11 00:00:00</td><td>2002-01-11 00:00:00</td><td>2002-01-11 16:34:23</td></tr>\n",
       "<tr><td>20110330</td><td>E14</td><td>2003417</td><td>Medical Incident</td><td>Other</td><td>BALBOA ST/PARK PR...</td><td>SF</td><td>94118</td><td>B07</td><td>31</td><td>7145</td><td>3</td><td>3</td><td>3</td><td>false</td><td>null</td><td>1</td><td>ENGINE</td><td>1</td><td>7</td><td>1</td><td>Inner Richmond</td><td>(37.7768682293368...</td><td>020110330-E14</td><td>2.0</td><td>2002-01-11 00:00:00</td><td>2002-01-11 00:00:00</td><td>2002-01-11 16:51:31</td></tr>\n",
       "<tr><td>20110330</td><td>M12</td><td>2003417</td><td>Medical Incident</td><td>Other</td><td>BALBOA ST/PARK PR...</td><td>SF</td><td>94118</td><td>B07</td><td>31</td><td>7145</td><td>3</td><td>3</td><td>3</td><td>true</td><td>null</td><td>1</td><td>MEDIC</td><td>2</td><td>7</td><td>1</td><td>Inner Richmond</td><td>(37.7768682293368...</td><td>020110330-M12</td><td>1.8166667</td><td>2002-01-11 00:00:00</td><td>2002-01-11 00:00:00</td><td>2002-01-11 16:51:12</td></tr>\n",
       "</table>\n",
       "only showing top 20 rows\n"
      ],
      "text/plain": [
       "+----------+------+--------------+----------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+-------------+-------+-------------+---------+--------------+--------------------------+----------------------+------------------+--------------------+--------------------+-------------+---------+-------------------+-------------------+-------------------+\n",
       "|CallNumber|UnitID|IncidentNumber|        CallType|CallFinalDisposition|             Address|City|Zipcode|Battalion|StationArea| Box|OriginalPriority|Priority|FinalPriority|ALSUnit|CallTypeGroup|NumAlarms|      UnitType|UnitSequenceInCallDispatch|FirePreventionDistrict|SupervisorDistrict|        Neighborhood|            Location|        RowID|    Delay|       IncidentDate|        OnWatchDate|      AvailableDtTS|\n",
       "+----------+------+--------------+----------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+-------------+-------+-------------+---------+--------------+--------------------------+----------------------+------------------+--------------------+--------------------+-------------+---------+-------------------+-------------------+-------------------+\n",
       "|  20110016|   T13|       2003235|  Structure Fire|               Other|2000 Block of CAL...|  SF|  94109|      B04|         38|3362|               3|       3|            3|  false|         null|        1|         TRUCK|                         2|                     4|                 5|     Pacific Heights|(37.7895840679362...|020110016-T13|     2.95|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 01:51:44|\n",
       "|  20110022|   M17|       2003241|Medical Incident|               Other|0 Block of SILVER...|  SF|  94124|      B10|         42|6495|               3|       3|            3|   true|         null|        1|         MEDIC|                         1|                    10|                10|Bayview Hunters P...|(37.7337623673897...|020110022-M17|      4.7|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 03:01:18|\n",
       "|  20110023|   M41|       2003242|Medical Incident|               Other|MARKET ST/MCALLIS...|  SF|  94102|      B03|         01|1455|               3|       3|            3|   true|         null|        1|         MEDIC|                         2|                     3|                 6|          Tenderloin|(37.7811772186856...|020110023-M41|2.4333334|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 02:39:50|\n",
       "|  20110032|   E11|       2003250|    Vehicle Fire|               Other|APPLETON AV/MISSI...|  SF|  94110|      B06|         32|5626|               3|       3|            3|  false|         null|        1|        ENGINE|                         1|                     6|                 9|      Bernal Heights|(37.7388432849018...|020110032-E11|      1.5|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 04:16:46|\n",
       "|  20110043|   B04|       2003259|          Alarms|               Other|1400 Block of SUT...|  SF|  94109|      B04|         03|3223|               3|       3|            3|  false|         null|        1|         CHIEF|                         2|                     4|                 2|    Western Addition|(37.7872890372638...|020110043-B04|3.4833333|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 06:01:58|\n",
       "|  20110072|   T08|       2003279|  Structure Fire|               Other|  BEALE ST/FOLSOM ST|  SF|  94105|      B03|         35|2122|               3|       3|            3|  false|         null|        1|         TRUCK|                         2|                     3|                 6|Financial Distric...|(37.7886866619654...|020110072-T08|     1.75|2002-01-11 00:00:00|2002-01-11 00:00:00|2002-01-11 08:03:26|\n",
       "|  20110125|   E33|       2003301|          Alarms|               Other|0 Block of FARALL...|  SF|  94112|      B09|         33|8324|               3|       3|            3|  false|         null|        1|        ENGINE|                         2|                     9|                11|Oceanview/Merced/...|(37.7140353531157...|020110125-E33|2.7166667|2002-01-11 00:00:00|2002-01-11 00:00:00|2002-01-11 09:46:44|\n",
       "|  20110130|   E36|       2003304|          Alarms|               Other|600 Block of POLK ST|  SF|  94102|      B02|         03|3114|               3|       3|            3|  false|         null|        1|        ENGINE|                         1|                     2|                 6|          Tenderloin|(37.7826266328595...|020110130-E36|1.7833333|2002-01-11 00:00:00|2002-01-11 00:00:00|2002-01-11 09:58:53|\n",
       "|  20110197|   E05|       2003343|Medical Incident|               Other|1500 Block of WEB...|  SF|  94115|      B04|         05|3513|               3|       3|            3|  false|         null|        1|        ENGINE|                         1|                     4|                 5|           Japantown|(37.784958590666,...|020110197-E05|1.5166667|2002-01-11 00:00:00|2002-01-11 00:00:00|2002-01-11 12:06:57|\n",
       "|  20110215|   E06|       2003348|Medical Incident|               Other|DIAMOND ST/MARKET ST|  SF|  94114|      B05|         06|5415|               3|       3|            3|  false|         null|        1|        ENGINE|                         1|                     5|                 8| Castro/Upper Market|(37.7618954753708...|020110215-E06|2.7666667|2002-01-11 00:00:00|2002-01-11 00:00:00|2002-01-11 13:08:40|\n",
       "|  20110274|   M07|       2003381|Medical Incident|               Other|2700 Block of MIS...|  SF|  94110|      B06|         11|5525|               1|       1|            2|   true|         null|        1|         MEDIC|                         1|                     6|                 9|             Mission|(37.7530339738059...|020110274-M07|2.1833334|2002-01-11 00:00:00|2002-01-11 00:00:00|2002-01-11 15:31:02|\n",
       "|  20110275|   T15|       2003382|  Structure Fire|               Other|BRUNSWICK ST/GUTT...|  SF|  94112|      B09|         43|6218|               3|       3|            3|  false|         null|        1|         TRUCK|                         1|                     9|                11|           Excelsior|(37.7105545807996...|020110275-T15|      2.5|2002-01-11 00:00:00|2002-01-11 00:00:00|2002-01-11 14:59:04|\n",
       "|  20110304|   E03|       2003399|Medical Incident|               Other|1000 Block of SUT...|  SF|  94109|      B04|         03|1557|               3|       3|            3|  false|         null|        1|        ENGINE|                         1|                     4|                 3|            Nob Hill|(37.7881263034393...|020110304-E03|2.4166667|2002-01-11 00:00:00|2002-01-11 00:00:00|2002-01-11 16:22:49|\n",
       "|  20110308|   E14|       2003403|Medical Incident|               Other|100 Block of 21ST...|  SF|  94121|      B07|         14|7173|               3|       3|            3|  false|         null|        1|        ENGINE|                         1|                     7|                 1|      Outer Richmond|(37.7850084431077...|020110308-E14|     4.95|2002-01-11 00:00:00|2002-01-11 00:00:00|2002-01-11 16:18:33|\n",
       "|  20110313|   B10|       2003408|  Structure Fire|               Other|700 Block of CAPP ST|  SF|  94110|      B06|         07|5472|               3|       3|            3|  false|         null|        1|         CHIEF|                         6|                     6|                 9|             Mission|(37.7547064357942...|020110313-B10|1.4166666|2002-01-11 00:00:00|2002-01-11 00:00:00|2002-01-11 16:09:08|\n",
       "|  20110313|    D3|       2003408|  Structure Fire|               Other|700 Block of CAPP ST|  SF|  94110|      B06|         07|5472|               3|       3|            3|  false|         null|        1|         CHIEF|                         4|                     6|                 9|             Mission|(37.7547064357942...| 020110313-D3|2.5333333|2002-01-11 00:00:00|2002-01-11 00:00:00|2002-01-11 16:09:08|\n",
       "|  20110313|   E32|       2003408|  Structure Fire|               Other|700 Block of CAPP ST|  SF|  94110|      B06|         07|5472|               3|       3|            3|   true|         null|        1|        ENGINE|                         8|                     6|                 9|             Mission|(37.7547064357942...|020110313-E32|1.8833333|2002-01-11 00:00:00|2002-01-11 00:00:00|2002-01-11 16:09:08|\n",
       "|  20110315|   RC2|       2003409|Medical Incident|               Other|200 Block of LAGU...|  SF|  94116|      B08|         20|8635|               3|       3|            3|   true|         null|        1|RESCUE CAPTAIN|                         2|                     8|                 7|  West of Twin Peaks|(37.7501117393668...|020110315-RC2|     5.35|2002-01-11 00:00:00|2002-01-11 00:00:00|2002-01-11 16:34:23|\n",
       "|  20110330|   E14|       2003417|Medical Incident|               Other|BALBOA ST/PARK PR...|  SF|  94118|      B07|         31|7145|               3|       3|            3|  false|         null|        1|        ENGINE|                         1|                     7|                 1|      Inner Richmond|(37.7768682293368...|020110330-E14|      2.0|2002-01-11 00:00:00|2002-01-11 00:00:00|2002-01-11 16:51:31|\n",
       "|  20110330|   M12|       2003417|Medical Incident|               Other|BALBOA ST/PARK PR...|  SF|  94118|      B07|         31|7145|               3|       3|            3|   true|         null|        1|         MEDIC|                         2|                     7|                 1|      Inner Richmond|(37.7768682293368...|020110330-M12|1.8166667|2002-01-11 00:00:00|2002-01-11 00:00:00|2002-01-11 16:51:12|\n",
       "+----------+------+--------------+----------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+-------------+-------+-------------+---------+--------------+--------------------------+----------------------+------------------+--------------------+--------------------+-------------+---------+-------------------+-------------------+-------------------+\n",
       "only showing top 20 rows"
      ]
   },
   "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cleaned_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----+\n",
      "|year|\n",
      "+----+\n",
      "|2000|\n",
      "|2001|\n",
      "|2002|\n",
      "|2003|\n",
      "|2004|\n",
      "|2005|\n",
      "|2006|\n",
      "|2007|\n",
      "|2008|\n",
      "|2009|\n",
      "|2010|\n",
      "|2011|\n",
      "|2012|\n",
      "|2013|\n",
      "|2014|\n",
      "|2015|\n",
      "|2016|\n",
      "|2017|\n",
      "|2018|\n",
      "+----+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# 6）显示所有有事故的年份，并按照年份从小到到排序\n",
    "cleaned_df.select(F.year('IncidentDate').alias('year')) \\\n",
    "    .distinct() \\\n",
    "    .orderBy(F.col('year').asc()) \\\n",
    "    .show(truncate=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------------------------------+------+\n",
      "|CallType                       |count |\n",
      "+-------------------------------+------+\n",
      "|Medical Incident               |113794|\n",
      "|Structure Fire                 |23319 |\n",
      "|Alarms                         |19406 |\n",
      "|Traffic Collision              |7013  |\n",
      "|Citizen Assist / Service Call  |2524  |\n",
      "|Other                          |2166  |\n",
      "|Outside Fire                   |2094  |\n",
      "|Vehicle Fire                   |854   |\n",
      "|Gas Leak (Natural and LP Gases)|764   |\n",
      "|Water Rescue                   |755   |\n",
      "|Odor (Strange / Unknown)       |490   |\n",
      "|Electrical Hazard              |482   |\n",
      "|Elevator / Escalator Rescue    |453   |\n",
      "|Smoke Investigation (Outside)  |391   |\n",
      "|Fuel Spill                     |193   |\n",
      "|HazMat                         |124   |\n",
      "|Industrial Accidents           |94    |\n",
      "|Explosion                      |89    |\n",
      "|Train / Rail Incident          |57    |\n",
      "|Aircraft Emergency             |36    |\n",
      "+-------------------------------+------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# 7）过滤CallType为空的记录，并统计每种CallType的类型总数并按照顺序倒排\n",
    "df.where(F.col('CallType').isNotNull())\\\n",
    "    .groupby('CallType') \\\n",
    "    .agg(F.count('CallType').alias('count')) \\\n",
    "    .orderBy(F.col('count').desc()).show(truncate=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 作业\n",
    "\n",
    "1. 打印 2018 年份所有的 `CallType`，并去重\n",
    "2. 2018 年的哪个月份有最高的火警\n",
    "3. San Francisco 的哪个 neighborhood 在 2018 年发生的火灾次数最多？\n",
    "4. San Francisco 的哪个 neighborhood 在 2018 年响应最慢？\n",
    "5. 2018 年的哪一周的火警次数最多?\n",
    "6. `neighborhood` `zip code` 和 `number of fire calls` 有关联（correlation）吗？\n",
    "7. 实现使用 `parquest` 存储并读取"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table border='1'>\n",
       "<tr><th>CallType</th></tr>\n",
       "<tr><td>Alarms</td></tr>\n",
       "<tr><td>Assist Police</td></tr>\n",
       "<tr><td>Citizen Assist / ...</td></tr>\n",
       "<tr><td>Electrical Hazard</td></tr>\n",
       "<tr><td>Elevator / Escala...</td></tr>\n",
       "<tr><td>Explosion</td></tr>\n",
       "<tr><td>Fuel Spill</td></tr>\n",
       "<tr><td>Gas Leak (Natural...</td></tr>\n",
       "<tr><td>HazMat</td></tr>\n",
       "<tr><td>Medical Incident</td></tr>\n",
       "<tr><td>Odor (Strange / U...</td></tr>\n",
       "<tr><td>Other</td></tr>\n",
       "<tr><td>Outside Fire</td></tr>\n",
       "<tr><td>Smoke Investigati...</td></tr>\n",
       "<tr><td>Structure Fire</td></tr>\n",
       "<tr><td>Suspicious Package</td></tr>\n",
       "<tr><td>Traffic Collision</td></tr>\n",
       "<tr><td>Train / Rail Inci...</td></tr>\n",
       "<tr><td>Vehicle Fire</td></tr>\n",
       "<tr><td>Water Rescue</td></tr>\n",
       "</table>\n"
      ],
      "text/plain": [
       "+--------------------+\n",
       "|            CallType|\n",
       "+--------------------+\n",
       "|              Alarms|\n",
       "|       Assist Police|\n",
       "|Citizen Assist / ...|\n",
       "|   Electrical Hazard|\n",
       "|Elevator / Escala...|\n",
       "|           Explosion|\n",
       "|          Fuel Spill|\n",
       "|Gas Leak (Natural...|\n",
       "|              HazMat|\n",
       "|    Medical Incident|\n",
       "|Odor (Strange / U...|\n",
       "|               Other|\n",
       "|        Outside Fire|\n",
       "|Smoke Investigati...|\n",
       "|      Structure Fire|\n",
       "|  Suspicious Package|\n",
       "|   Traffic Collision|\n",
       "|Train / Rail Inci...|\n",
       "|        Vehicle Fire|\n",
       "|        Water Rescue|\n",
       "+--------------------+"
      ]
   },
   "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cleaned_df.select(\"CallType\") \\\n",
    "    .where(F.year('IncidentDate') == 2018) \\\n",
    "    .distinct() \\\n",
    "    .sort(\"CallType\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table border='1'>\n",
       "<tr><th>month</th><th>count</th></tr>\n",
       "<tr><td>10</td><td>1068</td></tr>\n",
       "</table>\n"
      ],
      "text/plain": [
       "+-----+-----+\n",
       "|month|count|\n",
       "+-----+-----+\n",
       "|   10| 1068|\n",
       "+-----+-----+"
      ]
   },
   "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cleaned_df.select(F.month(\"IncidentDate\")\\\n",
    "    .alias(\"month\"))\\\n",
    "    .where(F.year(\"IncidentDate\") == 2018)\\\n",
    "    .groupBy(\"month\")\\\n",
    "    .count()\\\n",
    "    .orderBy(F.desc(\"count\"))\\\n",
    "    .limit(1)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table border='1'>\n",
       "<tr><th>Neighborhood</th><th>count</th></tr>\n",
       "<tr><td>Tenderloin</td><td>1393</td></tr>\n",
       "</table>\n"
      ],
      "text/plain": [
       "+------------+-----+\n",
       "|Neighborhood|count|\n",
       "+------------+-----+\n",
       "|  Tenderloin| 1393|\n",
       "+------------+-----+"
      ]
   },
   "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cleaned_df.select(\"Neighborhood\", \"IncidentDate\").where(F.col(\"City\") == \"San Francisco\").where(F.year(\"IncidentDate\")==2018).groupBy(\"Neighborhood\").count().orderBy(F.desc(\"count\")).limit(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table border='1'>\n",
       "<tr><th>Neighborhood</th><th>Delay</th><th>IncidentDate</th></tr>\n",
       "<tr><td>Chinatown</td><td>491.26666</td><td>2018-03-17 00:00:00</td></tr>\n",
       "</table>\n"
      ],
      "text/plain": [
       "+------------+---------+-------------------+\n",
       "|Neighborhood|    Delay|       IncidentDate|\n",
       "+------------+---------+-------------------+\n",
       "|   Chinatown|491.26666|2018-03-17 00:00:00|\n",
       "+------------+---------+-------------------+"
      ]
   },
   "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
   "cleaned_df.select(\"Neighborhood\", \"Delay\", \"IncidentDate\").where(F.col(\"City\") == \"San Francisco\").where(F.year(\"IncidentDate\")==2018).orderBy(\"Delay\", ascending=False).limit(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table border='1'>\n",
       "<tr><th>Neighborhood</th><th>averageDelay</th></tr>\n",
       "<tr><td>Treasure Island</td><td>11.320833250880241</td></tr>\n",
       "</table>\n"
      ],
      "text/plain": [
       "+---------------+------------------+\n",
       "|   Neighborhood|      averageDelay|\n",
       "+---------------+------------------+\n",
       "|Treasure Island|11.320833250880241|\n",
       "+---------------+------------------+"
      ]
   },
   "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 延迟平均值\n",
    "cleaned_df.select(\"Neighborhood\", \"Delay\", \"IncidentDate\")\\\n",
    "    .where(F.col(\"City\") == \"San Francisco\")\\\n",
    "        .where(F.year(\"IncidentDate\")==2018)\\\n",
    "            .groupBy(\"Neighborhood\")\\\n",
    "                .agg(F.avg(\"Delay\").alias(\"averageDelay\"))\\\n",
    "                    .orderBy(\"averageDelay\", ascending=False)\\\n",
    "                        .limit(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table border='1'>\n",
       "<tr><th>week</th><th>count</th></tr>\n",
       "<tr><td>45</td><td>64</td></tr>\n",
       "</table>\n"
      ],
      "text/plain": [
       "+----+-----+\n",
       "|week|count|\n",
       "+----+-----+\n",
       "|  45|   64|\n",
       "+----+-----+"
      ]
   },
   "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cleaned_df.select(F.weekofyear(\"IncidentDate\")\\\n",
    "    .alias(\"week\"))\\\n",
    "        .where(F.year(\"IncidentDate\") == 2018)\\\n",
    "            .groupBy(\"week\").count()\\\n",
   "                .orderBy(\"count\").limit(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.2150447213598307"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cleaned_df.select(\"ZipCode\", \"NumAlarms\")\\\n",
    "    .groupBy(\"ZipCode\") \\\n",
    "    .sum(\"NumAlarms\").corr(\"ZipCode\", \"sum(NumAlarms)\")\n",
   "# 0.2150447213598307 相关系数接近 0，可以认为无关。"
   ]
  }
 ],
 "metadata": {
"kernelspec": {
   "display_name": "Python 3.7.13 ('pyspark2')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
   "hash": "48e87896ed641b154b37119b199fe5cbef001faa1d3cf1f339fb46433cd23b9b"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
